{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2c86c7be",
   "metadata": {},
   "source": [
    "# Chapter 2 - Data Standardization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7b5ca0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# After import you can jump to Step 2 if you wish to skip the data acquisition and use the data snapshot provided in the repo"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c055240",
   "metadata": {},
   "source": [
    "## Step 1 - Acquire Data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf81046d",
   "metadata": {},
   "source": [
    "### Wikipedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcbd8f6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the Wikipedia webpage and find all tables.\n",
    "\n",
    "url = \"https://en.wikipedia.org/wiki/List_of_MPs_elected_in_the_2019_United_Kingdom_general_election\"\n",
    "\n",
    "website_url = requests.get(url).text\n",
    "soup = BeautifulSoup(website_url,'html.parser')\n",
    "tables = soup.find_all('table')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "647b6d9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the table with members returned, extract rows as a list of lists and load into dataframe\n",
    "\n",
    "for table in tables:\n",
    "    if 'Member returned' in table.text:\n",
    "        headers = [header.text.strip() for header in table.find_all('th')]\n",
    "        headers = headers[:5]\n",
    "        dfrows = []\n",
    "        table_rows = table.find_all('tr')    \n",
    "        for row in table_rows:\n",
    "            td = row.find_all('td')\n",
    "            dfrow = [row.text for row in td if row.text!='\\n']\n",
    "            dfrows.append(dfrow)\n",
    "\n",
    "df_w = pd.DataFrame(dfrows, columns=headers)            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df654f78",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ffb004c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de90537c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove unwanted columns and check length\n",
    "\n",
    "df_w = df_w[['Constituency','Member returned','Notes']]\n",
    "len(df_w)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5cbd19ee",
   "metadata": {},
   "source": [
    "### They Work For You"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c32d2dbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download all current Members of Parliament\n",
    "\n",
    "url = \"https://www.theyworkforyou.com/mps/?f=csv\"\n",
    "df_t = pd.read_csv(url, header=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4250cf7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_t.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7866e5a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_t.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d8cbd3a",
   "metadata": {},
   "source": [
    "### Add facebook links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b98ef249",
   "metadata": {},
   "outputs": [],
   "source": [
    "def facelink(url):\n",
    "    website_url = requests.get(url).text\n",
    "    soup = BeautifulSoup(website_url,'html.parser')\n",
    "    flinks = [f\"{item['href']}\" for item in soup.select(\n",
    "            \"a[href*='facebook.com']\")]\n",
    "    if flinks[0]!=\"https://www.facebook.com/TheyWorkForYou\":\n",
    "        return(flinks[0])\n",
    "    else:\n",
    "        return(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3503c936",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_t['Flink'] = df_t.apply(lambda x: facelink(x.URI), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f5e1b72",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_t.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96df144e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove unwanted columns and check length\n",
    "\n",
    "df_t = df_t[['Constituency','First name','Last name','Flink']]\n",
    "len(df_t)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3afe535e",
   "metadata": {},
   "source": [
    "### Saving to Local Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9661fcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save file to local storage\n",
    "# Note uncommenting these lines will overwrite the repo snapshot and may result \n",
    "# in different values in this and subsequent chapters\n",
    "\n",
    "#df_t.to_csv('mps_they_raw.csv', index=False)\n",
    "#df_w.to_csv('mps_wiki_raw.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "579b7664",
   "metadata": {},
   "source": [
    "## Step 2 - Data Cleansing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41cd5eae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Theyworkforyou update their website after MP resignations, by-elections and general elections.\n",
    "\n",
    "# To ensure consistency with the data at the time of writing the following statements overwrite\n",
    "# the 'wiki' and 'they' raw dataframes with the snapshot provided in the repo\n",
    "\n",
    "df_t = pd.read_csv('mps_they_raw.csv')\n",
    "df_w = pd.read_csv('mps_wiki_raw.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69fa54b9",
   "metadata": {},
   "source": [
    "#### Wikipedia Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "287f3344",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f69a8a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.tail(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "babd8461",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename columns for consistency\n",
    "\n",
    "df_w = df_w.rename(columns={ 'Member returned' : 'Fullname'})\n",
    "\n",
    "# Remove null rows at start and end and strip tailing '\\n'\n",
    "\n",
    "df_w = df_w.dropna()\n",
    "df_w['Constituency'] = df_w['Constituency'].str.rstrip(\"\\n\")\n",
    "df_w['Fullname'] = df_w['Fullname'].str.rstrip(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30285938",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check \\n not elsewhere in Fullname\n",
    "\n",
    "df_w[df_w['Fullname'].astype(str).str.contains('\\n')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6644cd0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip leading '\\n' \n",
    "\n",
    "df_w['Fullname'] = df_w['Fullname'].str.lstrip(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5518e582",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check \\n not elsewhere in Constituency\n",
    "\n",
    "df_w[df_w['Constituency'].astype(str).str.contains('\\n')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a358bcb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split into Firstname and compound Lastname\n",
    "\n",
    "df_w['Firstname'] = df_w['Fullname'].str.split().str[0]\n",
    "df_w['Lastname'] = df_w['Fullname'].astype(str).apply(lambda x: ' '.join(x.split()[1:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cc7c529",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for compound lastnames\n",
    "\n",
    "df_w[df_w['Lastname'].astype(str).str.contains(' ')]['Lastname']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5fd1490",
   "metadata": {},
   "source": [
    "#### They Work for You Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "963ff01b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_t = df_t.rename(columns={'Last name' : 'Lastname', 'First name' : 'Firstname'})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b99a3ff",
   "metadata": {},
   "source": [
    "### Calculate Exact Match Counts "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c902041",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All matching columns\n",
    "\n",
    "len(df_w.merge(df_t, on=['Constituency','Firstname','Lastname']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53b6da8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_w.merge(df_t, on=['Constituency','Lastname']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abd507ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match on First name and Last name\n",
    "\n",
    "len(df_w.merge(df_t, on=['Firstname','Lastname'] ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79ada167",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match on Consistency and Last name\n",
    "\n",
    "len(df_w.merge(df_t, on=['Constituency','Lastname'] ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f36dafe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match on Consistency and First name\n",
    "\n",
    "len(df_w.merge(df_t, on=['Constituency','Firstname'] ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46f6dad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match on Lastname\n",
    "\n",
    "len(df_w.merge(df_t, on=['Lastname'] ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b0667b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match on Firstname\n",
    "\n",
    "len(df_w.merge(df_t, on=['Firstname'] ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef47143d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match on Consistency\n",
    "\n",
    "len(df_w.merge(df_t, on=['Constituency'] ))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60c086bf",
   "metadata": {},
   "source": [
    "## Step 3 - Further Cleansing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bee621a1",
   "metadata": {},
   "source": [
    "### Consitutency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0beffbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w_outer = df_w.merge(df_t, on=['Constituency'],how=\"outer\",indicator=True)\n",
    "df_w_outer[df_w_outer['_merge']=='right_only']['Constituency'].head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1249362",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w_outer[df_w_outer['_merge']=='left_only']['Constituency'].head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "594794c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove commas from both dataframes\n",
    "\n",
    "df_t['Constituency'] = df_t['Constituency'].str.replace(',', '')\n",
    "df_w['Constituency'] = df_w['Constituency'].str.replace(',', '')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd21884a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match on Consistency\n",
    "\n",
    "len(df_w.merge(df_t, on=['Constituency']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e93846a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Repeat perfect match count\n",
    "\n",
    "len(df_w.merge(df_t, on=['Constituency','Firstname','Lastname']))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e68ad62",
   "metadata": {},
   "source": [
    "### Firstname"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c10a6629",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w_inner = df_w.merge(df_t, on=['Constituency'], suffixes=('_w', '_t'))\n",
    "df_w_inner[(df_w_inner['Firstname_w'] != df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] != df_w_inner['Lastname_t'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93b4203f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w_inner = df_w.merge(df_t, on=['Constituency'], suffixes=('_w', '_t'))\n",
    "df_w_inner[(df_w_inner['Firstname_w'] == df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] != df_w_inner['Lastname_t']) |\n",
    "         (df_w_inner['Firstname_w'] != df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] == df_w_inner['Lastname_t'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19d0d89c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove extra suffixes in TheyWorkForYou Firstnames\n",
    "\n",
    "df_t['Firstname'] = df_t['Firstname'].str.split().str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10d2705e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final resolved match count\n",
    "\n",
    "df_resolved = df_w.merge(df_t, on=['Firstname','Lastname'] )\n",
    "len(df_resolved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0019aa0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w_inner = df_w.merge(df_t, on=['Constituency'], suffixes=('_w', '_t'))\n",
    "df_w_unmatched = df_w_inner[(df_w_inner['Firstname_w'] == df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] != df_w_inner['Lastname_t']) |\n",
    "         (df_w_inner['Firstname_w'] != df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] == df_w_inner['Lastname_t'])]\n",
    "df_w_unmatched"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d3f7d1b",
   "metadata": {},
   "source": [
    "## Sample Problem: Find MPs who held their seat and currently have Facebook account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7bdb373",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_resolved.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b980ae2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select those records with a non-null Facebook reference\n",
    "\n",
    "df_heldwithface = df_resolved[(df_resolved['Flink']!=\"\") & (df_resolved['Notes']==\"Seat held\\n\")]\n",
    "len(df_heldwithface)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c57ca34",
   "metadata": {},
   "source": [
    "## Save Files for Subsequent Chapters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "774069f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save file to local storage\n",
    "# Note uncommenting these lines will overwrite the repo snapshot and may result \n",
    "# in different values in this and subsequent chapters\n",
    "\n",
    "# Save unmatched to pick up in Chapter 3\n",
    "\n",
    "#df_w_unmatched.to_csv('mps_unmatched.csv', index=False)\n",
    "\n",
    "#df_w.to_csv('mps_wiki_clean.csv', index=False)\n",
    "#df_t.to_csv('mps_they_clean.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "handsonentityresolution",
   "language": "python",
   "name": "handsonentityresolution"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
